{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "为了演示LightGBM在蟒蛇中的用法，本代码以sklearn包中自带的鸢尾花数据集为例，用lightgbm算法实现鸢尾花种类的分类任务。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import lightgbm as lgb\n",
    "import pandas as pd\n",
    "from sklearn.metrics import mean_squared_error\n",
    "from sklearn.datasets import load_iris\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.datasets import  make_classification"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Start training...\n",
      "[1]\tvalid_0's auc: 1\tvalid_0's l2: 0.625393\n",
      "Training until validation scores don't improve for 5 rounds.\n",
      "[2]\tvalid_0's auc: 1\tvalid_0's l2: 0.567814\n",
      "[3]\tvalid_0's auc: 1\tvalid_0's l2: 0.515827\n",
      "[4]\tvalid_0's auc: 1\tvalid_0's l2: 0.466764\n",
      "[5]\tvalid_0's auc: 1\tvalid_0's l2: 0.424377\n",
      "[6]\tvalid_0's auc: 1\tvalid_0's l2: 0.385323\n",
      "Early stopping, best iteration is:\n",
      "[1]\tvalid_0's auc: 1\tvalid_0's l2: 0.625393\n",
      "Save model...\n",
      "Start predicting...\n",
      "The rmse of prediction is: 0.7908176179297906\n"
     ]
    }
   ],
   "source": [
    "iris = load_iris()   # 载入鸢尾花数据集\n",
    "data=iris.data\n",
    "target = iris.target\n",
    "X_train,X_test,y_train,y_test =train_test_split(data,target,test_size=0.2)\n",
    " \n",
    " \n",
    "# 加载你的数据\n",
    "# print('Load data...')\n",
    "# df_train = pd.read_csv('../regression/regression.train', header=None, sep='\\t')\n",
    "# df_test = pd.read_csv('../regression/regression.test', header=None, sep='\\t')\n",
    "#\n",
    "# y_train = df_train[0].values\n",
    "# y_test = df_test[0].values\n",
    "# X_train = df_train.drop(0, axis=1).values\n",
    "# X_test = df_test.drop(0, axis=1).values\n",
    " \n",
    "# 创建成lgb特征的数据集格式\n",
    "lgb_train = lgb.Dataset(X_train, y_train) # 将数据保存到LightGBM二进制文件将使加载更快\n",
    "lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)  # 创建验证数据\n",
    " \n",
    "# 将参数写成字典下形式\n",
    "params = {\n",
    "    'task': 'train',\n",
    "    'boosting_type': 'gbdt',  # 设置提升类型\n",
    "    'objective': 'regression', # 目标函数\n",
    "    'metric': {'l2', 'auc'},  # 评估函数\n",
    "    'num_leaves': 31,   # 叶子节点数\n",
    "    'learning_rate': 0.05,  # 学习速率\n",
    "    'feature_fraction': 0.9, # 建树的特征选择比例\n",
    "    'bagging_fraction': 0.8, # 建树的样本采样比例\n",
    "    'bagging_freq': 5,  # k 意味着每 k 次迭代执行bagging\n",
    "    'verbose': 1 # <0 显示致命的, =0 显示错误 (警告), >0 显示信息\n",
    "}\n",
    " \n",
    "print('Start training...')\n",
    "# 训练 cv and train\n",
    "gbm = lgb.train(params,lgb_train,num_boost_round=20,valid_sets=lgb_eval,early_stopping_rounds=5) # 训练数据需要参数列表和数据集\n",
    " \n",
    "print('Save model...') \n",
    " \n",
    "gbm.save_model('model.txt')   # 训练后保存模型到文件\n",
    " \n",
    "print('Start predicting...')\n",
    "# 预测数据集\n",
    "y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration) #如果在训练期间启用了早期停止，可以通过best_iteration方式从最佳迭代中获得预测\n",
    "# 评估模型\n",
    "print('The rmse of prediction is:', mean_squared_error(y_test, y_pred) ** 0.5) # 计算真实值和预测值之间的均方根误差"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
